{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Pull"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "from pymongo import MongoClient\n",
    "\n",
    "client = MongoClient('localhost:27017')\n",
    "db = client.teamspeed\n",
    "collection = db.forum_teamspeed\n",
    "\n",
    "dataset = []\n",
    "for element in collection.find():\n",
    "    dataset.append(element)\n",
    "    \n",
    "df = pd.DataFrame(dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Merge subject and post"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['full_verbatim'] = df.apply(lambda x: x['subject'] + \" \" + x['post'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re, itertools\n",
    "import nltk\n",
    "from nltk.corpus import stopwords \n",
    "\n",
    "def data_cleaning(verbatim):\n",
    "    verbatim = verbatim.strip() #remove whitespaces\n",
    "    verbatim = re.sub(r'<[^<]+?>', ' ', verbatim) #remove html tags\n",
    "    verbatim = re.sub(r'https?:\\/\\/.*[\\r\\n]*', ' ', verbatim, flags=re.MULTILINE) #remove urls\n",
    "    verbatim = re.sub(r'[^\\w\\s]',' ',verbatim) #remove ponctuation\n",
    "    verbatim = ''.join(''.join(s)[:2] for _, s in itertools.groupby(verbatim)) #Standardize words\n",
    "    verbatim = ' '.join(re.findall('[A-Z][^A-Z]*', verbatim)) #Split attached words\n",
    "    verbatim = verbatim.lower() #Lowercase\n",
    "    verbatim = ' '.join([word for word in verbatim.split() if word not in (stopwords.words('english'))]) #Stopwords\n",
    "    tokens = nltk.word_tokenize(verbatim) #Tokenize\n",
    "    return(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['cleaned'] = df.apply(lambda x: data_cleaning(x['full_verbatim']),axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# POS Extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def pos_extraction(tokens):\n",
    "    pos_tokens = nltk.pos_tag(tokens)\n",
    "    return(pos_tokens)\n",
    "\n",
    "\n",
    "def select_pos(pos_tokens,lst_pos):\n",
    "    subset = [pos_token[0] for pos_token in pos_tokens for pos in lst_pos if pos_token[1].startswith(pos)]\n",
    "    return(subset)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['pos'] = df.apply(lambda x: pos_extraction(x['cleaned']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['cleaned_nn_jj'] = df.apply(lambda x: select_pos(x['pos'],['NN','JJ']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def make_string(token_lst):\n",
    "    cleaned = [token for token in token_lst if len(token) > 2]\n",
    "    return(\" \".join(cleaned))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['str_cleaned_nn_jj'] = df.apply(lambda x: make_string(x['cleaned_nn_jj']),axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Topic Modelling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "run -i lda_script.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "loglikelihoods = []\n",
    "\n",
    "for i in range(10,50,10):\n",
    "    model_lda = topic_analysis()\n",
    "    model_lda.get_results(df,'str_cleaned_nn_jj',i)\n",
    "    loglikelihoods.append((i,model_lda.model.loglikelihood()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "element = max(enumerate(loglikelihoods), key=lambda x: x[1])\n",
    "print(\"Optimal number of topics : {} ({})\".format(element[1][0],element[1][1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Topic Model Results"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "model_lda = topic_analysis()\n",
    "model_lda.get_results(df,'str_cleaned_nn_jj',element)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0,\n",
       "  'watch steel case time watches black dial new call hours gold movement trade strap special rolex prices available perfect date photos panerai details hour night crystal sapphire stainless water automatic specs complete local delivery wide price chronograph mike power reserve clock sale hand bezel boxes papers size crown brand',\n",
       "  '0.045781035469107556 0.019679776887871852 0.019036184210526316 0.01827817505720824 0.015489273455377574 0.013172339816933639 0.013086527459954233 0.012228403890160182 0.012056779176201373 0.011970966819221968 0.011799342105263158 0.011055635011441647 0.01029762585812357 0.010226115560640733 0.009282179633867276 0.00892462814645309 0.008853117848970252 0.008595680778032036 0.008266733409610984 0.008138014874141876 0.00812371281464531 0.0077232551487414185 0.0076660469107551485 0.007294193363844394 0.007208381006864988 0.007151172768878718 0.007093964530892448 0.006993850114416476 0.006936641876430206 0.006865131578947368 0.006607694508009153 0.006564788329519451 0.006521882151029748 0.006493278032036613 0.0064789759725400456 0.006450371853546911 0.006450371853546911 0.006407465675057208 0.006235840961098398 0.006221538901601831 0.006207236842105263 0.0061929347826086955 0.006178632723112128 0.006092820366132723 0.006007008009153318 0.005921195652173913 0.00587828947368421 0.0057066647597254 0.005520737986270023'),\n",
       " (1,\n",
       "  'race car track series team cars mans time cup world drivers year motorsport driver hours road class lap first audi challenge second event championship circuit season day great porsche hour place record laps motorsports races sports last sport test new course weekend victory competition international american speed rally endurance',\n",
       "  '0.04712434742030507 0.03497688628343104 0.021531557149940213 0.014415157348266107 0.014291203663195964 0.013532898766296264 0.013518315979817423 0.012045454545454545 0.011330898007991367 0.010448639426021524 0.010331977134190802 0.009996573045177473 0.009384096013066176 0.008859115699827924 0.008545585790532854 0.00833413538658967 0.008129976375885905 0.0075320821302534485 0.007050850176451716 0.006985227637296935 0.0068612739522267915 0.00641649896462216 0.006401916178143319 0.006343585032227958 0.006219631347157814 0.006139426021524193 0.005833187505468545 0.005607154315046519 0.005446743663779275 0.005424869484061014 0.005315498585469712 0.005001968676174643 0.004980094496456383 0.0048561408113862396 0.004797809665470878 0.004761352699273777 0.0047467699127949365 0.004644690407443054 0.004535319508851751 0.004528028115612331 0.0044551141832181295 0.004229080992796104 0.003995756409134657 0.003959299442937557 0.0038863855105433546 0.0038353457578674133 0.003828054364627993 0.003784306005191472 0.0036895178930790094'),\n",
       " (2,\n",
       "  'new audi car engine model mercedes production coupe series version jaguar auto show motor rover year range concept lotus liter speed next models power convertible top class drive source litre v10 current first generation geneva debut time horsepower turbo twin seconds rear torque sedan last hybrid details door spy',\n",
       "  '0.04632075801749271 0.023784314868804664 0.019696851311953353 0.01706128279883382 0.013994227405247814 0.012758075801749272 0.012746413994169098 0.01271725947521866 0.011236209912536443 0.011183731778425656 0.009650204081632653 0.008874693877551021 0.008623965014577259 0.008501516034985423 0.008495685131195335 0.008489854227405247 0.008320758017492711 0.008227463556851312 0.008221632653061224 0.00820997084548105 0.007807638483965014 0.007440291545189505 0.006851370262390671 0.006658950437317784 0.0064257142857142855 0.006204139941690962 0.006105014577259475 0.005743498542274053 0.005714344023323615 0.005265364431486881 0.005242040816326531 0.00499131195335277 0.004857201166180758 0.004653119533527697 0.004565655976676385 0.004542332361516035 0.004513177842565597 0.0044373760932944605 0.0041866472303207 0.004128338192419825 0.00402338192419825 0.003976734693877551 0.0039417492711370265 0.003813469387755102 0.0037843148688046645 0.0037609912536443147 0.0037143440233236152 0.0036968513119533526 0.003685189504373178'),\n",
       " (3,\n",
       "  'engine new performance system power high speed car rear front design control sports standard torque wheel fuel drive transmission air vehicle top technology suspension available seconds mph driver weight sport range wheels model dynamic rpm road maximum features low cylinder body chassis series interior active efficiency output comfort electric',\n",
       "  '0.020600220178531557 0.019128035893254954 0.014984109756920825 0.01458166079004627 0.012005987402049115 0.010788254721635138 0.010684396923732027 0.00993402433388205 0.00988728832482565 0.008417700484496627 0.007641363445170872 0.007514137642739562 0.006636539250458273 0.00641843787486174 0.006337948081486829 0.006257458288111918 0.006083496476624207 0.005847219986394629 0.005639504390588407 0.005244844758556584 0.005172144300024406 0.0050942509515970735 0.004909903360319051 0.004891728245686007 0.004655451755456429 0.004613908636295185 0.004577558407029096 0.004439946824807473 0.004325703247114052 0.004115391206360251 0.003985568958981362 0.003969990289295896 0.003910272055501607 0.0038401680419170073 0.003723328019276007 0.0035909093269495406 0.0035311910931552517 0.0035156124234697853 0.0033728079513530075 0.0032455821489216965 0.0032351963691313853 0.003204039029760452 0.0030923918970146074 0.0030794096722767186 0.0030352701081678966 0.0029054478607890076 0.002835343847204408 0.002773029168462541 0.0027626433886722298'),\n",
       " (4,\n",
       "  'turbo stock performance kit car engine factory suspension power exhaust air dyno brake oil system new shark software click rear switzer upgrade package custom tune track parts front street bar sit project werks race information motorsports fuel steel boost ready tech stage rotors springs full upgrades adjustable art complete',\n",
       "  '0.01627679996469576 0.014967601003228867 0.014835210097013114 0.014173255565934349 0.013047932863100449 0.012864056604467457 0.010937033413993718 0.009054140525591897 0.008590772353836762 0.00844367134693037 0.007686101161362449 0.0074728047013481805 0.007171247637190076 0.0068991107744132505 0.006840270371650694 0.006832915321305374 0.006737299666816219 0.0064798729047300324 0.00614889563919065 0.005803208272960629 0.005486941108111885 0.0054648759570759264 0.0053104198998242145 0.0051191885908459045 0.0048176315266878 0.004729370922543965 0.004685240620472047 0.004677885570126728 0.004560204764601614 0.004464589110112459 0.0044131037576952215 0.004398393657004582 0.004361618405277984 0.0043174881032060665 0.004280712851479468 0.004266002750788829 0.00425864770044351 0.0042218724487169115 0.004185097196990313 0.0040674163914652 0.004037996190083921 0.003949735585940086 0.003942380535594766 0.003846764881105611 0.003846764881105611 0.0038394098307602915 0.0038173446797243327 0.0037585042769617757 0.0036702436728179404'),\n",
       " (5,\n",
       "  'tube video aston martin corvette car nissan ford bugatti pagani vantage new speed veyron v12 top camaro gear zonda mustang test supercar review shelby chevrolet dodge track koenigsegg x202a sport x202c rlm world super z06 huayra cars drag part grand mph motor roadster drive chris road garage cadillac fastest',\n",
       "  '0.07774899009462677 0.03718676332245034 0.03413214542637376 0.030634829284488966 0.02007647612196337 0.018792651209119587 0.018515964805489463 0.01719893752421006 0.015483481821703284 0.014288196558021141 0.01369055392618007 0.011964030767528085 0.011786951469204806 0.010813015328426762 0.010768745503845942 0.010669138398539097 0.009938686292955566 0.009927618836810361 0.009252504011952854 0.009163964362791214 0.008256432958884402 0.00810148857285153 0.008090421116706326 0.008046151292125506 0.007448508660284435 0.007216092081235129 0.007182889712799514 0.007149687344363899 0.007050080239057054 0.006861933484588569 0.006839798572298158 0.006651651817829673 0.006529909800232417 0.006297493221183112 0.006286425765037907 0.005832660063084501 0.005688783133196836 0.00563344585247081 0.005622378396325605 0.00558917602788999 0.005423164185711915 0.005345691992695479 0.005301422168114659 0.005091140501355764 0.005069005589065354 0.004670577167837973 0.0045931049748215375 0.004371755851917437 0.004050799623706492'),\n",
       " (6,\n",
       "  'team race year season formula laren grand prix red teams hamilton ferrari bull world driver time last renault new drivers championship news car mercedes alonso sport vettel button second source champion points massa next good title weekend first races lap williams honda circuit test german brawn lewis mark end',\n",
       "  '0.028618368988258005 0.01586623773524208 0.01569252050828374 0.015415859739424161 0.013955348238700338 0.013318385073186426 0.010982853466302074 0.010230078816149268 0.010094965417403892 0.008396396976033456 0.008261283577288081 0.008126170178542705 0.008126170178542705 0.008126170178542705 0.008081132378960914 0.008061830464854432 0.007946018980215538 0.007321923757439279 0.006820073990670741 0.0066592247064500565 0.006150940968312691 0.00585497828534663 0.005655525172912981 0.0056297892874376705 0.005411034260897539 0.005340260575840437 0.005127939520669133 0.005050731864243204 0.004806240952227763 0.004735467267170661 0.004574617982949976 0.004516712240630529 0.004407334727360463 0.004149975872607367 0.003796107447321859 0.0037832395045842046 0.0037639375904777225 0.0037253337622647578 0.003699597876789448 0.0034551069647740067 0.003442239022036352 0.0033585973942415956 0.0033392954801351135 0.003313559594659804 0.003313559594659804 0.0033071256232909763 0.0033006916519221488 0.0032942576805533217 0.003274955766446839'),\n",
       " (7,\n",
       "  'carbon black front fiber interior rear wheels edition car side bentley sport spoiler seats color leather package red new design wheel body exterior white special blue available diffuser door continental trim kit custom parts look lights bumper paint inch options silver full roof vorsteiner grey light finish air exhaust',\n",
       "  '0.04332886941495927 0.028227659837077267 0.025691249074302643 0.022741360157985682 0.01839675388792891 0.017612996790915825 0.0128981115773883 0.012330350530733152 0.011318254751913108 0.011238027647494446 0.011040545544310047 0.010787521599605035 0.0103740434460627 0.00997290792396939 0.00989885213527524 0.009806282399407553 0.00949771661318193 0.009195322142680819 0.009139780301160208 0.009115095038262157 0.008825043199210071 0.008683102937546285 0.008541162675882497 0.007504381634164404 0.0072637003209084176 0.006424401382374722 0.00641823006665021 0.006196062700567761 0.00599240928165885 0.005936867440138238 0.005856640335719575 0.005659158232535176 0.005535731918044927 0.0055048753394223646 0.005399962972105653 0.005214823500370279 0.005054369291532954 0.004980313502838805 0.004930942977042705 0.004807516662552456 0.004628548506541595 0.00449277956060232 0.0043631819303875585 0.004233584300172797 0.00419038509010121 0.004171871142927672 0.003980560355467786 0.0037645643051098495 0.003709022463589237'),\n",
       " (8,\n",
       "  'ferrari tube maserati italia car spider scuderia f430 california video gran cars enzo new alfa italian turismo f40 sound stradale challenge novitec romeo maranello track f12 fiorano supercar ferraris rosso laren test italy top modena engine supercars special sale cabrio red zagato road berlinetta quattroporte v12 horse beautiful yellow',\n",
       "  '0.16277477449162397 0.02823977747744916 0.026636860797686773 0.0243739196027281 0.02391818838985448 0.021246660590250495 0.020618065813873085 0.019376591130527705 0.01725508376025395 0.01588789012163309 0.015385014300531163 0.012949209542068706 0.012697771631517743 0.011126284690574221 0.01037197095892133 0.009476223402583524 0.00946050853317409 0.009240500361441997 0.007904736461640003 0.007873306722821133 0.0077790175063645215 0.007700443159317346 0.007134707860577678 0.007008988905302197 0.006301819781877612 0.0060660967407360844 0.006018952132507778 0.0057989439607756856 0.0057989439607756856 0.005641795266681334 0.005626080397271899 0.005563220919634158 0.005311783009083195 0.005076059967941666 0.005060345098532231 0.004997485620894491 0.004871766665619008 0.004526039538611434 0.004526039538611434 0.004510324669201999 0.004463180060973693 0.004133167803375554 0.0041017380645566835 0.0041017380645566835 0.0040860231951472484 0.004054593456328377 0.003913159631643461 0.0038660150234151552 0.0037245811987302382'),\n",
       " (9,\n",
       "  'porsche turbo carrera new panamera cayenne spyder cayman boxster tube sport rolls car video hybrid stuttgart classic phantom drive cars speedster model auto ghost sports cabriolet models germany targa porsches flat cab gemballa magazine ruf test option macan german special twin time chrono manual generation factory total version royce',\n",
       "  '0.2715310544053924 0.06865978027749814 0.0444406997271706 0.04035555361024788 0.038240031513984335 0.032579186180534285 0.03020104754818284 0.029661224668446624 0.02265811703943625 0.014327336922425993 0.01308720327978874 0.012591149822733839 0.009585649465283553 0.008812389664580323 0.008447644475569368 0.007032433142206854 0.00596737719029486 0.005704760654206971 0.0056755810390860945 0.00554427277104215 0.00545673392567952 0.005427554310558643 0.00520870719715207 0.0051649377744707545 0.0050628091215476866 0.00494609066106418 0.004931500853503742 0.004829372200580674 0.004537576049371908 0.004333318743525773 0.0040707022074378835 0.003997753169635692 0.003997753169635692 0.003983163362075254 0.003837265286470871 0.003735136633547803 0.0035454691352621056 0.0034871099050203525 0.003282852599174217 0.0031077749084489573 0.0031077749084489573 0.002976466640405013 0.002947287025284136 0.002772209334558877 0.002728439911877562 0.002699260296756686 0.002553362221152303 0.0025387724135918647 0.002509592798470988'),\n",
       " (10,\n",
       "  'wheels black wheel series finish vellano mercedes new matte front rear lip pics enjoy modulare custom piece c63 guys monoblock tires center design gloss email available silver flickr satin tire white thanks photos audi performance p40 specs v10 please stock set vossen concave pirelli questions s63 chrome spec lips',\n",
       "  '0.1021054414436609 0.043585029614346286 0.027007154661821624 0.01752235190260627 0.015130621336700236 0.015110091460598038 0.014945852451780456 0.014699493938554083 0.013180283106991451 0.013077633726480463 0.013036573974276067 0.011219679939231569 0.009649144417413442 0.00949517034664696 0.008848479249427731 0.008735564930865643 0.008602120736201358 0.008561060983996963 0.008119668647799711 0.007719336063806856 0.007565361993040373 0.007555097054989274 0.007555097054989274 0.007431917798376088 0.007195824223200814 0.007082909904638727 0.006928935833872244 0.006887876081667848 0.006703107196748068 0.0061282706658865324 0.006066681037579939 0.006015356347324445 0.005881912152660159 0.00569714326774038 0.005522639320871699 0.005481579568667304 0.005204426241287635 0.004824623533396977 0.004691179338732691 0.004465350701608516 0.004167667498126649 0.004106077869820056 0.0040547531795645615 0.004003428489309068 0.003972633675155771 0.0039521037990535735 0.003931573922951376 0.0038699842946447825 0.0038597193565936836'),\n",
       " (11,\n",
       "  'phone email new camera available order sale free apple system please price special information click products battery orders video pricing pro mac audio digital shipping contact display quality questions time vehicle pod screen stock service music view data retail product radar full call prices website use parts info navigation',\n",
       "  '0.017087968493355237 0.011395120949678962 0.010536527549649097 0.010424537106166941 0.01037787442138271 0.009901915036583546 0.009089984321337913 0.008604692399581903 0.007914084664775273 0.007671438703897269 0.00754078318650142 0.0070928214125727945 0.006850175451694789 0.00664485963864417 0.006308888308197701 0.005608948036434225 0.005534287740779454 0.005478292519038376 0.005338304464685681 0.0048996752277139025 0.004825014932059131 0.004741022099447515 0.004647696729879051 0.004283727788562043 0.004227732566820965 0.004218400029864119 0.004218400029864119 0.00419040241899358 0.0041810698820367335 0.004143739734209348 0.004115742123338809 0.003966421532029267 0.003947756458115574 0.003826433477676572 0.003826433477676572 0.003826433477676572 0.003714443034194416 0.003686445423323877 0.0036584478124533377 0.00360245259071226 0.003555789905928028 0.0034904621472301033 0.0034717970733164107 0.0034624645363595646 0.0034624645363595646 0.0034344669254890254 0.0034344669254890254 0.0032478161863520984 0.0032104860385247127'),\n",
       " (12,\n",
       "  'car paint detail wheel correction wheels wash cleaner film clear interior swissvax finish uber vehicle full clay polish time new glass pad wax protection water clean spray work black scratches step shots towels adam foam engine process exterior einszett pics client owner bra good tires door deep total yellow',\n",
       "  '0.03139481262404172 0.026228839942405734 0.013931684632447369 0.013250671284585751 0.011956745923648676 0.01076010818383469 0.010692006849048528 0.010584990465813132 0.009894248355839205 0.009446725298672997 0.009310522629100674 0.009125676148966807 0.009028388527843718 0.007559345448885085 0.007374498968751217 0.007296668871852746 0.007102093629606569 0.006956162197921938 0.006537825427092658 0.006508639140755731 0.006343250184846481 0.006275148850060319 0.006265420087948011 0.0062459625637233925 0.006168132466824922 0.006168132466824922 0.006148674942600304 0.0060805736078141425 0.00593464217612951 0.005506576643187921 0.005263357590380201 0.0052536288282678915 0.004961765964898627 0.004728275674203215 0.004699089387866289 0.004562886718293965 0.004553157956181656 0.004407226524497023 0.00434885395182317 0.004309938903373935 0.0042710238549247 0.004222380044363156 0.004037533564229288 0.004008347277892362 0.003940245943106199 0.003901330894656964 0.003862415846207729 0.0038429583219831112 0.003755399462972332'),\n",
       " (13,\n",
       "  'exhaust system sound tube performance armytrix valvetronic mode power titanium high low valves pipe audi systems car tips muffler range sport full control auto pipes race stainless valve loud flow stock cat mid steel mercedes video launch torque note rpm throttle start best email new level open akrapovic quality',\n",
       "  '0.0770424311130648 0.041378548457996905 0.028434051315271963 0.027311963537931198 0.02270793327626495 0.020683548110959443 0.016438123221433034 0.012192698331906624 0.011232561367790297 0.011128450130717442 0.010862388080420147 0.010631029775813802 0.009636189066006525 0.009578349489854938 0.009543645744163987 0.00911563288064225 0.00887270666080559 0.008629780440968928 0.007970409272840848 0.00763493973116165 0.007623371815931332 0.0072300626981005485 0.007195358952409596 0.006744210258427226 0.006674802767045323 0.0065359877842815165 0.006304629479675173 0.006235221988293269 0.005992295768456609 0.005899752446614071 0.005865048700923119 0.0057609374638502645 0.005622122481086458 0.005610554565856142 0.005506443328783287 0.005494875413552969 0.005298220854637577 0.00528665293940726 0.005263517108946626 0.005101566295722185 0.004812368414964255 0.004557874279897277 0.0044074913819031536 0.004372787636212201 0.004199268907757444 0.003956342687920783 0.003944774772690465 0.0038753672813085626 0.0038522314508479282'),\n",
       " (14,\n",
       "  'pics car cars day event photos pictures great guys new show flickr enjoy time post photo weekend last share shots thread thanks nice shoot today week trip please com speed photography everyone night meet member next good many drive email beach fun photoshoot couple friend check cool teamspeed house',\n",
       "  '0.023724702455682218 0.021766182895595843 0.020022885265189288 0.015058432753908073 0.014333852257319339 0.012432725211814249 0.011593359686062945 0.011471400592577712 0.011127045505089998 0.010990738282959446 0.008228723518735069 0.007999153460409926 0.007396532057306426 0.007360661735693122 0.007360661735693122 0.007260224835175872 0.006750866268266961 0.006506948081296497 0.006377814923488604 0.006291726151616675 0.005889978549547675 0.0056891047485131754 0.005603015976641246 0.005531275333414639 0.005373445918316104 0.005308879339412157 0.004756476386567282 0.0045842988428234244 0.004555602585532782 0.004397773170434246 0.004369076913143603 0.004347554720175621 0.004304510334239657 0.004196899369399746 0.003988851504042585 0.00393863305378396 0.0039027627321706567 0.0038525442819120318 0.0038525442819120318 0.0038094998959760673 0.003795151767330746 0.0037808036386854245 0.0036588445452001924 0.0036516704808775315 0.00363732235223221 0.003615800159264228 0.0035512335803602815 0.0035225373230696384 0.003508189194424317'),\n",
       " (15,\n",
       "  'car drive time way much cars road good little first big thing bit right something day track best different speed test great wheel hard feels feel lot power point top doesn sure old second course full end throttle front things last driver gear many get everything work side nothing',\n",
       "  '0.04499641980360066 0.011645513345083723 0.011200939506483699 0.010229171912375679 0.009072493075664108 0.008561036447186204 0.008450876557975578 0.00766795448822863 0.006888966700239205 0.005956541923706409 0.0050241171471736125 0.0049375629485081205 0.004784125959964749 0.004784125959964749 0.004780191678207227 0.004591346153846154 0.004492989109908096 0.004363157811909858 0.00431594643081959 0.004237260795669143 0.00418218085106383 0.0040130067354903695 0.0039461239456124895 0.0038792411557346095 0.0038320297746443415 0.0036864613496160143 0.0035959728691930004 0.003580235742162911 0.003536958642830165 0.003513352952285031 0.0035054843887699864 0.003489747261739897 0.003438601598892107 0.003249756074531034 0.0032418875110159892 0.003190741848168199 0.0031868075664106764 0.0031750047211381092 0.003115990494775274 0.0031081219312602297 0.0030963190859876625 0.0030845162407150952 0.003049107704897394 0.0030412391413823497 0.0030412391413823497 0.00302550201435226 0.0029979620420496037 0.0029507506609593357 0.002868130744051366'),\n",
       " (16,\n",
       "  'lamborghini gallardo aventador game boat super superleggera p560 league tube lambo year club team last goal top murcielago real season players world p640 com barcelona roadster cigarette football player city madrid games time final united best chelsea man side p670 p700 spyder play champions manchester diablo yacht italian arsenal',\n",
       "  '0.07558044399435014 0.033486261759454204 0.024265250646270286 0.01355182154944967 0.012019428084108413 0.010793513311835408 0.00972750046638062 0.008261732803880286 0.008248407643312103 0.008155131519334807 0.007861977986834742 0.007608799936039229 0.007542174133198305 0.007209045118993684 0.007075793513311835 0.006942541907629987 0.006835940623084508 0.0067426644991072145 0.006636063214561736 0.006489486448311702 0.006369560003198038 0.005969805186152493 0.005876529062175199 0.005863203901607014 0.00579657809876609 0.005570050369106947 0.005343522639447805 0.005276896836606881 0.004983743304106814 0.004983743304106814 0.00494376782240226 0.004903792340697705 0.004863816858993151 0.004583988487061269 0.004437411720811236 0.004357460757402127 0.004357460757402127 0.0043441355968339415 0.0043441355968339415 0.004290834954561202 0.004224209151720278 0.00406430722490206 0.00406430722490206 0.0040376569037656905 0.0039976814220611355 0.003971031100924766 0.003931055619220212 0.0039044052980838416 0.0036512272472883295'),\n",
       " (17,\n",
       "  'month base rate residual lease audi vehicle eligible com compare owner loyalty premium sedan quattro fleet incentive dealer financial special programs prestige coupe convertible auto payment rates captive manufacturer formula value payments model months interest cap cost terms sport depreciation bank avant mercedes data residuals help best drive available',\n",
       "  '0.1602517530478792 0.1593294074520794 0.15625492213274683 0.1532395615310937 0.04059514940816158 0.026972814454810982 0.021320491444653353 0.016011091797036667 0.014568448685657525 0.014343774758475529 0.011245639552071139 0.010725342036491775 0.01063074248820462 0.008632327030638428 0.00689406033086192 0.006551136968320976 0.005983539678598035 0.005841640356167299 0.005616966428985301 0.005593316541913512 0.005415942388875093 0.005321342840587936 0.005274043066444358 0.0046946208331855215 0.004647321059041943 0.003322927383021746 0.0032756276088781676 0.003228327834734589 0.003086428512303854 0.0030627786252320644 0.002826279754514172 0.0026370806579398584 0.0024360566178296497 0.002305982238934809 0.002211382690647652 0.002152257972968179 0.0021049581988246005 0.001998533707001549 0.001986708763465654 0.0019748838199297597 0.001868459328106708 0.0018211595539631296 0.001785684723355446 0.0017738597798195512 0.001691085175068289 0.0016792602315323943 0.0016674352879964998 0.0016319604573888158 0.0015491858526375535'),\n",
       " (18,\n",
       "  'car anyone good new guys thanks time something miles people help much great years sure dealer old get many thing last way someone year anything right today cars price nice work need question things little guy pics bad post gun thoughts deal first lot day end idea week know',\n",
       "  '0.02676792701985632 0.016476321432465503 0.01340519486812934 0.012623783967144399 0.012121015713022303 0.012011981633815103 0.011030674920950292 0.007365918369819367 0.006814690524938517 0.006626909610748338 0.006057509419332954 0.00586367105629793 0.00579703911900464 0.005760694425935573 0.005748579528245884 0.005494166676762414 0.005457821983693348 0.0052034091322098784 0.005161006990295967 0.005051972911088766 0.004876306894588275 0.004797560059605297 0.00469458342924294 0.0046158365942599614 0.004579491901190894 0.004561319554656361 0.004482572719673382 0.004440170577759471 0.004391710987000715 0.004391710987000715 0.004155470482051779 0.004094895993603334 0.003858655488654398 0.003858655488654398 0.0038283682444301757 0.0038283682444301757 0.0037193341652229745 0.003640587330239996 0.003616357534860618 0.00359212773948124 0.0035860702906363954 0.003580012841791551 0.0035497255975673286 0.0035376106998776394 0.003446748967204972 0.0033922319276013715 0.003374059581066838 0.0033377148879977707 0.003319542541463237'),\n",
       " (19,\n",
       "  'new cars car world company year design years brand laren automotive sports performance vehicles production market group luxury high sales vehicle customers first many development future technology business unique industry project motor source show history model time models line engineering work american part president global product international service classic',\n",
       "  '0.015664756235548777 0.014479960971450125 0.012503935652088356 0.012405544349879472 0.01210627080566078 0.010310629540348634 0.010253234614060119 0.009195528115314607 0.007408086125186533 0.007354790836490054 0.006645553533067677 0.00630118397533658 0.005940415867237336 0.005813327101884193 0.005801028189108083 0.0055386513832177235 0.00525577638936718 0.005222979288630885 0.0051778832751184795 0.005136886899198111 0.00483761335497942 0.004677727488889982 0.004349756481527033 0.004226767353765927 0.004128376051557042 0.004054582574900379 0.003952091635099457 0.003677415916432987 0.003656917728472803 0.003624120627736508 0.0035544267886718816 0.003538028238303734 0.0034929322247913286 0.003419138748134665 0.003357644184254112 0.003267452157229301 0.003205957593348748 0.0031116659287319 0.003054271002443384 0.0029722782506026468 0.002959979337826536 0.002939481149866352 0.002894385136353946 0.0028861858611698727 0.0028820862235778358 0.002873886948393762 0.002849289122841541 0.002754997458224693 0.002746798183040619')]"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_lda.topics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(10, -21057434.972703733),\n",
       " (20, -20951459.22897517),\n",
       " (30, -20999990.31793885),\n",
       " (40, -21027163.26144508)]"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loglikelihoods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def print_topics(model_lda, show=False, n_examples=10):\n",
    "    \"\"\"Save a bar chart of term weights per topic and print sample posts.\n",
    "\n",
    "    Each entry of ``model_lda.topics`` is read as a sequence whose first three\n",
    "    items are the topic id, a space-separated string of terms and a\n",
    "    space-separated string of weights. For every topic the weights are drawn\n",
    "    as a bar chart saved to ``topic<id>.png`` in the working directory, and\n",
    "    randomly sampled ``full_verbatim`` values of the rows of\n",
    "    ``model_lda.dataframe`` whose ``Topic_id`` equals the topic id are printed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model_lda : object\n",
    "        Must expose ``topics`` and ``dataframe`` as described above.\n",
    "    show : bool, optional\n",
    "        If True, ``plt.show()`` is called for each figure before it is closed.\n",
    "        Figures are always closed after saving, so with the default (False)\n",
    "        they are only written to disk.\n",
    "    n_examples : int, optional\n",
    "        Maximum number of posts printed per topic; fewer are printed when the\n",
    "        topic has fewer rows.\n",
    "    \"\"\"\n",
    "    topics = model_lda.topics\n",
    "    dataframe = model_lda.dataframe\n",
    "    width = 0.25  # bar width, in x-axis units\n",
    "\n",
    "    for topic in topics:\n",
    "        topic_id = topic[0]\n",
    "        labels = topic[1].split(\" \")\n",
    "        weights = [float(x) for x in topic[2].split(\" \")]\n",
    "\n",
    "        fig, ax = plt.subplots()\n",
    "        positions = np.arange(len(labels))\n",
    "\n",
    "        # Bars are centred on their positions explicitly, so the tick labels\n",
    "        # line up with the bars whatever the library's default alignment is.\n",
    "        ax.bar(positions, weights, width, color='b', align='center')\n",
    "        ax.set_xticks(positions)\n",
    "        ax.set_xticklabels(labels, rotation=90)\n",
    "\n",
    "        fig.tight_layout()\n",
    "        fig.savefig('topic' + str(topic_id) + '.png', dpi=300)\n",
    "        if show:\n",
    "            plt.show()\n",
    "        # Close the figure once it is saved; otherwise one figure per topic\n",
    "        # stays open for the whole loop.\n",
    "        plt.close(fig)\n",
    "\n",
    "        subset = dataframe[dataframe['Topic_id'] == topic_id]\n",
    "        # DataFrame.sample raises when asked for more rows than exist.\n",
    "        sample_size = min(n_examples, len(subset))\n",
    "\n",
    "        print(\"Topic \" + str(topic_id) + \": \\n\\n\\n\")\n",
    "        for verbatim in subset.sample(sample_size)['full_verbatim']:\n",
    "            print(verbatim + \"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print_topics(model_lda)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
